Pandas Cheat Sheet#

Run Jupyter:

jupyter notebook

Build book:

jupyter-book build pandas_cheatsheet.ipynb

Setup Environment#

# ensures you are running the pip version associated with the current Python kernel
import sys
!{sys.executable} -m pip install requests
!{sys.executable} -m pip install scipy
!{sys.executable} -m pip install matplotlib
!{sys.executable} -m pip install scikit-learn
!{sys.executable} -m pip install plotly
Requirement already satisfied: requests in c:\users\wsaye\pycharmprojects\wmsayer.github.io\venv\lib\site-packages (2.28.2)
Requirement already satisfied: certifi>=2017.4.17 in c:\users\wsaye\pycharmprojects\wmsayer.github.io\venv\lib\site-packages (from requests) (2022.12.7)
Requirement already satisfied: idna<4,>=2.5 in c:\users\wsaye\pycharmprojects\wmsayer.github.io\venv\lib\site-packages (from requests) (3.4)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in c:\users\wsaye\pycharmprojects\wmsayer.github.io\venv\lib\site-packages (from requests) (1.26.14)
Requirement already satisfied: charset-normalizer<4,>=2 in c:\users\wsaye\pycharmprojects\wmsayer.github.io\venv\lib\site-packages (from requests) (3.0.1)
^C
Collecting scipy
  Using cached scipy-1.10.0-cp310-cp310-win_amd64.whl (42.5 MB)
Requirement already satisfied: numpy<1.27.0,>=1.19.5 in c:\users\wsaye\pycharmprojects\wmsayer.github.io\venv\lib\site-packages (from scipy) (1.24.1)
Installing collected packages: scipy
import pandas as pd
import numpy as np
import json
import requests

import datetime as dt
from dateutil.relativedelta import relativedelta

import scipy
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics

import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

import plotly.offline as pyo
# Set notebook mode to work in offline
pyo.init_notebook_mode()

Data & File I/O#

JSON Handling#

Read JSON File#

# load local JSON file and return as Python dict
def json_load(f_path):
    """Read a JSON file and return its parsed contents.

    Parameters
    ----------
    f_path : str
        Path to the JSON file on disk.

    Returns
    -------
    dict or list
        Whatever ``json.load`` parses from the file (the sample data here
        is a dict).
    """
    # context manager guarantees the handle is closed even if json.load raises
    # (original opened with a stray trailing comma and closed manually)
    with open(f_path) as f:
        return json.load(f)

Flattening (JSON Normalize)#

json_norm_dict = [
     {
         "id": 1,
         "name": "Cole Volk",
         "fitness": {"height": 130, "weight": 60},
     },
     {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}},
     {
         "id": 2,
         "name": "Faye Raker",
         "fitness": {"height": 130, "weight": 60},
     },
]

pd.json_normalize(json_norm_dict, max_level=0)
id name fitness
0 1.0 Cole Volk {'height': 130, 'weight': 60}
1 NaN Mark Reg {'height': 130, 'weight': 60}
2 2.0 Faye Raker {'height': 130, 'weight': 60}
pd.json_normalize(json_norm_dict, max_level=1)
id name fitness.height fitness.weight
0 1.0 Cole Volk 130 60
1 NaN Mark Reg 130 60
2 2.0 Faye Raker 130 60
json_norm_dict = [
     {
         "state": "Florida",
         "shortname": "FL",
         "info": {"governor": "Rick Scott"},
         "counties": [
             {"name": "Dade", "population": 12345},
             {"name": "Broward", "population": 40000},
             {"name": "Palm Beach", "population": 60000},
         ],
     },
     {
         "state": "Ohio",
         "shortname": "OH",
         "info": {"governor": "John Kasich"},
         "counties": [
             {"name": "Summit", "population": 1234},
             {"name": "Cuyahoga", "population": 1337},
         ],
     },
 ]

pd.json_normalize(json_norm_dict, 
                  record_path="counties",
                  meta=["state", "shortname", ["info", "governor"]])
name population state shortname info.governor
0 Dade 12345 Florida FL Rick Scott
1 Broward 40000 Florida FL Rick Scott
2 Palm Beach 60000 Florida FL Rick Scott
3 Summit 1234 Ohio OH John Kasich
4 Cuyahoga 1337 Ohio OH John Kasich

Load JSON via REST API#

# make GET request to REST API and return as Python dict
def run_json_get(url, params=None, headers=None, print_summ=True, print_resp=False):
    """Perform an HTTP GET and decode the JSON response body.

    Parameters
    ----------
    url : str
        Full endpoint URL.
    params : dict, optional
        Query-string parameters.
    headers : dict, optional
        HTTP request headers.
    print_summ : bool
        If True, print a request/response summary.
    print_resp : bool
        If True, print the decoded response body.

    Returns
    -------
    (json_dict, status_code) : tuple[dict, int]
        Decoded body ({} on any non-200 status) and the HTTP status code.
    """
    # None defaults avoid the shared-mutable-default-argument pitfall
    params = {} if params is None else params
    headers = {} if headers is None else headers

    if print_summ:
        print("/"*69)
        # colons moved next to the labels (was "Headers %s:" / "Parameters %s:")
        print("GET Address: %s\nHeaders: %s\nParameters: %s" % (url, repr(headers), repr(params)))

    response = requests.get(url, params=params, headers=headers)
    status_code = response.status_code

    if status_code == 200:
        json_dict = response.json()
    else:
        json_dict = {}

    if print_summ:
        print("Status Code: %d" % status_code)
        # isinstance is the idiomatic type check (the API may return a list)
        if isinstance(json_dict, dict):
            print("Response Keys: %s\n" % json_dict.keys())

    if print_resp:
        print("Response: %s\n" % json_dict)

    return json_dict, status_code

Read SQL Query w/ Params#

def read_sql(f_path, params={}):
    f = open(f_path, "r")
    query = f.read()
    f.close()
    
    if params:
        query = query.format(**params)
    
    return query

I/O tests#

# read JSON
test_json_path = "C:/Users/wsaye/PycharmProjects/CashAppInterview/templates/data.json"
print(json_load(test_json_path))
{'api_key': 'fdghdfghfgfg'}
# read SQL (no params)
test_sql_path = "C:/Users/wsaye/PycharmProjects/CashAppInterview/templates/query.sql"
print(read_sql(test_sql_path))
SELECT *
FROM my_table
WHERE id = 1
# read SQL (w/ params)
test_sql_params_path = "C:/Users/wsaye/PycharmProjects/CashAppInterview/templates/query_w_params.sql"
test_params = {"my_id": 102393, "max_date": "2000/01/01"}
print(read_sql(test_sql_params_path, params=test_params))
SELECT *
FROM my_table
WHERE id = 102393 OR date = '2000/01/01'

Sample REST Dataset (CoinMetrics)#

def get_asset_metrics(assets, metrics, freq, alt_params=None, page_size=10000, print_summ=True):
    """Pull time-series asset metrics from the CoinMetrics community API.

    Parameters
    ----------
    assets : list[str]
        Asset tickers, e.g. ['btc', 'eth'].
    metrics : list[str]
        Metric names, e.g. ['AdrActCnt', 'PriceUSD'].
    freq : str
        Frequency; options are 1b, 1s, 1m, 1h, 1d.
    alt_params : dict, optional
        Extra query parameters merged over the defaults. For 'start_time'
        and 'end_time', formats "2006-01-20T00:00:00Z" and "2006-01-20"
        are supported among others.
    page_size : int
        API page size.
    print_summ : bool
        Forwarded to run_json_get.

    Returns
    -------
    pd.DataFrame
        One row per (asset, time), sorted by asset then time, with each
        requested metric column cast to float.

    References
    ----------
    https://docs.coinmetrics.io/api/v4#operation/getTimeseriesAssetMetrics
    https://docs.coinmetrics.io/info/metrics
    """
    # None default avoids a mutable default dict shared across calls
    alt_params = {} if alt_params is None else alt_params

    # API expects comma-separated lists
    assets_str = ", ".join(assets)
    metrics_str = ", ".join(metrics)

    api_root = 'https://community-api.coinmetrics.io/v4'
    data_key = "data"

    url = "/".join([api_root, "timeseries/asset-metrics"])
    params = {'assets': assets_str, 'metrics': metrics_str, 'frequency': freq,
              'page_size': page_size}
    params.update(alt_params)

    result_dict, status_code = run_json_get(url, params=params, headers={}, print_summ=print_summ)

    # NOTE(review): a non-200 status yields {} and this will raise KeyError on
    # data_key — kept as-is to preserve existing fail-fast behavior
    result_df = pd.DataFrame(result_dict[data_key])
    result_df.sort_values(by=["asset", "time"], inplace=True)
    result_df.reset_index(inplace=True, drop=True)

    # API returns numeric values as strings; cast each requested metric
    for m in metrics:
        result_df[m] = result_df[m].astype(float)

    return result_df
def load_asset_metric_data(pull_new):
    """Return the sample metrics dataset, fresh from the API or from disk.

    NOTE: relies on the module-level globals test_assets, test_metrics,
    test_freq and test_df_cache being defined before it is called.

    Parameters
    ----------
    pull_new : bool
        True  -> query the API and refresh the CSV cache on disk.
        False -> load the previously cached CSV.

    Returns
    -------
    pd.DataFrame
    """
    if pull_new:
        # hit the API, then refresh the on-disk cache
        fresh = get_asset_metrics(test_assets, test_metrics, test_freq, print_summ=False)
        fresh.to_csv(test_df_cache, index=False)
        return fresh
    # serve the cached copy
    return pd.read_csv(test_df_cache)

Get data w/ cache#

test_assets = ['btc', 'eth']
test_metrics = ['AdrActCnt', 'PriceUSD']
test_freq = '1d'
test_df_cache = "C:/Users/wsaye/PycharmProjects/CashAppInterview/data/cm_test_data.csv"
test_df = load_asset_metric_data(True)
test_df = test_df.dropna(subset=test_metrics).reset_index(drop=True)

test_df["datetime"] = pd.to_datetime(test_df["time"], utc=True) # str timestamp to datetime
test_df["dayname"] = test_df["datetime"].dt.day_name()
test_df["date"] = pd.to_datetime(test_df["time"], utc=True).dt.date # datetime to date
test_df

Pandas Options & Settings#

Docs and available options here

pd.get_option("display.max_rows")
pd.set_option("display.max_rows", 10)
pd.set_option("display.max_columns", 10)

Summarize Dataframe#

General Info#

test_df.describe()  # summary stats of columns
AdrActCnt PriceUSD
count 7.213000e+03 7213.000000
mean 4.299169e+05 5678.012019
std 3.559791e+05 12124.585363
min 4.080000e+02 0.050541
25% 8.679800e+04 117.783464
50% 4.062670e+05 467.321788
75% 6.681570e+05 4335.593450
max 7.157228e+06 67541.755508
test_df.info()  # dataframe schema info, column types
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7213 entries, 0 to 7212
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   asset      7213 non-null   object 
 1   time       7213 non-null   object 
 2   AdrActCnt  7213 non-null   float64
 3   PriceUSD   7213 non-null   float64
dtypes: float64(2), object(2)
memory usage: 225.5+ KB
test_df.dtypes
asset         object
time          object
AdrActCnt    float64
PriceUSD     float64
dtype: object
test_df.head()
asset time AdrActCnt PriceUSD
0 btc 2010-07-18T00:00:00.000000000Z 860.0 0.085840
1 btc 2010-07-19T00:00:00.000000000Z 929.0 0.080800
2 btc 2010-07-20T00:00:00.000000000Z 936.0 0.074736
3 btc 2010-07-21T00:00:00.000000000Z 784.0 0.079193
4 btc 2010-07-22T00:00:00.000000000Z 594.0 0.058470
print(type(test_df.loc[0, "time"]))  # type of particular entry
<class 'str'>
test_df.nlargest(5, "PriceUSD")
# test_df.nsmallest(5, "PriceUSD")
asset time AdrActCnt PriceUSD
4131 btc 2021-11-08T00:00:00.000000000Z 1018796.0 67541.755508
4132 btc 2021-11-09T00:00:00.000000000Z 1195638.0 67095.585671
4112 btc 2021-10-20T00:00:00.000000000Z 977215.0 66061.796564
4137 btc 2021-11-14T00:00:00.000000000Z 806451.0 65032.225655
4134 btc 2021-11-11T00:00:00.000000000Z 1037951.0 64962.931294
test_df["asset"].unique()
# test_df["asset"].nunique()
array(['btc', 'eth'], dtype=object)

Crosstab#

cross_df = test_df.loc[test_df["asset"]== "btc", ["datetime", "dayname", "PriceUSD"]].copy().dropna()
cross_df = cross_df.sort_values(by="datetime")
cross_df["7d_SMA"] = cross_df["PriceUSD"].rolling(7).mean()
cross_df["beating_SMA"] = cross_df["PriceUSD"] > cross_df["7d_SMA"]
cross_df["return"] = cross_df["PriceUSD"].pct_change()
cross_df.dropna(inplace=True)
cross_df
datetime dayname PriceUSD 7d_SMA beating_SMA return
6 2010-07-24 00:00:00+00:00 Saturday 0.054540 0.070596 False -0.099894
7 2010-07-25 00:00:00+00:00 Sunday 0.050541 0.065553 False -0.073329
8 2010-07-26 00:00:00+00:00 Monday 0.056000 0.062010 False 0.108020
9 2010-07-27 00:00:00+00:00 Tuesday 0.058622 0.059708 False 0.046822
10 2010-07-28 00:00:00+00:00 Wednesday 0.058911 0.056811 True 0.004931
... ... ... ... ... ... ...
4525 2022-12-07 00:00:00+00:00 Wednesday 16848.251824 16991.106107 False -0.013020
4526 2022-12-08 00:00:00+00:00 Thursday 17231.456322 17029.701038 True 0.022744
4527 2022-12-09 00:00:00+00:00 Friday 17136.149001 17038.421018 True -0.005531
4528 2022-12-10 00:00:00+00:00 Saturday 17124.214680 17070.522061 True -0.000696
4529 2022-12-11 00:00:00+00:00 Sunday 17094.360659 17066.614300 True -0.001743

4524 rows × 6 columns

pd.crosstab(cross_df['beating_SMA'], cross_df['dayname'])
dayname Friday Monday Saturday Sunday Thursday Tuesday Wednesday
beating_SMA
False 269 285 292 293 276 297 282
True 377 361 355 354 370 349 364
pd.crosstab(cross_df['beating_SMA'], cross_df['dayname'], normalize=True)
dayname Friday Monday Saturday Sunday Thursday Tuesday Wednesday
beating_SMA
False 0.059461 0.062997 0.064545 0.064766 0.061008 0.065650 0.062334
True 0.083333 0.079797 0.078470 0.078249 0.081786 0.077144 0.080460
pd.crosstab(cross_df['beating_SMA'], cross_df['dayname'], values=cross_df['return'], aggfunc=np.mean)
dayname Friday Monday Saturday Sunday Thursday Tuesday Wednesday
beating_SMA
False -0.018046 -0.020816 -0.012431 -0.009124 -0.020508 -0.017438 -0.01737
True 0.019836 0.025968 0.017233 0.008952 0.022115 0.025215 0.02249

Sort/ Rank#

sort_df = test_df[["date", "asset", "PriceUSD"]].copy()
sort_df['price_rank'] = sort_df["PriceUSD"].rank(ascending=True, pct=False)
sort_df['price_pct'] = sort_df["PriceUSD"].rank(ascending=True, pct=True)
sort_df
date asset PriceUSD price_rank price_pct
0 2010-07-18 btc 0.085840 83.0 0.011507
1 2010-07-19 btc 0.080800 81.0 0.011230
2 2010-07-20 btc 0.074736 79.0 0.010952
3 2010-07-21 btc 0.079193 80.0 0.011091
4 2010-07-22 btc 0.058470 5.0 0.000693
... ... ... ... ... ...
7208 2022-12-07 eth 1232.552454 4488.0 0.622210
7209 2022-12-08 eth 1281.633052 4529.0 0.627894
7210 2022-12-09 eth 1263.208619 4516.0 0.626092
7211 2022-12-10 eth 1266.771442 4518.0 0.626369
7212 2022-12-11 eth 1263.069667 4515.0 0.625953

7213 rows × 5 columns

sort_df.sort_values(by="price_rank", ascending=False)
date asset PriceUSD price_rank price_pct
4131 2021-11-08 btc 67541.755508 7213.0 1.000000
4132 2021-11-09 btc 67095.585671 7212.0 0.999861
4112 2021-10-20 btc 66061.796564 7211.0 0.999723
4137 2021-11-14 btc 65032.225655 7210.0 0.999584
4134 2021-11-11 btc 64962.931294 7209.0 0.999445
... ... ... ... ... ...
4 2010-07-22 btc 0.058470 5.0 0.000693
17 2010-08-04 btc 0.057016 4.0 0.000555
8 2010-07-26 btc 0.056000 3.0 0.000416
6 2010-07-24 btc 0.054540 2.0 0.000277
7 2010-07-25 btc 0.050541 1.0 0.000139

7213 rows × 5 columns

Cleaning#

Deleting Rows/Columns here

Replace#

replace_df = test_df[["date", "asset", "dayname"]].copy()
# replace_df.replace("Sunday", "Sun")
replace_df.replace({"Sunday": "S",  "Monday": "M", "Tuesday": "T"})
date asset dayname
0 2010-07-18 btc S
1 2010-07-19 btc M
2 2010-07-20 btc T
3 2010-07-21 btc Wednesday
4 2010-07-22 btc Thursday
... ... ... ...
7208 2022-12-07 eth Wednesday
7209 2022-12-08 eth Thursday
7210 2022-12-09 eth Friday
7211 2022-12-10 eth Saturday
7212 2022-12-11 eth S

7213 rows × 3 columns

Drop/Fill NA()#

cleaning_df = test_df[["date", "asset", "PriceUSD"]].pivot(index="date", columns="asset", values="PriceUSD")
cleaning_df
asset btc eth
date
2010-07-18 0.085840 NaN
2010-07-19 0.080800 NaN
2010-07-20 0.074736 NaN
2010-07-21 0.079193 NaN
2010-07-22 0.058470 NaN
... ... ...
2022-12-07 16848.251824 1232.552454
2022-12-08 17231.456322 1281.633052
2022-12-09 17136.149001 1263.208619
2022-12-10 17124.214680 1266.771442
2022-12-11 17094.360659 1263.069667

4530 rows × 2 columns

# cleaning_df.dropna()  # drops N/A looking in all columns
cleaning_df.dropna(subset=["eth"])  # drops N/A in subset only
asset btc eth
date
2015-08-08 261.450276 1.199990
2015-08-09 266.342020 1.199990
2015-08-10 264.928825 1.199990
2015-08-11 271.421736 0.990000
2015-08-12 268.143868 1.288000
... ... ...
2022-12-07 16848.251824 1232.552454
2022-12-08 17231.456322 1281.633052
2022-12-09 17136.149001 1263.208619
2022-12-10 17124.214680 1266.771442
2022-12-11 17094.360659 1263.069667

2683 rows × 2 columns

cleaning_df.fillna(-1)
asset btc eth
date
2010-07-18 0.085840 -1.000000
2010-07-19 0.080800 -1.000000
2010-07-20 0.074736 -1.000000
2010-07-21 0.079193 -1.000000
2010-07-22 0.058470 -1.000000
... ... ...
2022-12-07 16848.251824 1232.552454
2022-12-08 17231.456322 1281.633052
2022-12-09 17136.149001 1263.208619
2022-12-10 17124.214680 1266.771442
2022-12-11 17094.360659 1263.069667

4530 rows × 2 columns

cleaning_df.fillna(method="ffill")
asset btc eth
date
2010-07-18 0.085840 NaN
2010-07-19 0.080800 NaN
2010-07-20 0.074736 NaN
2010-07-21 0.079193 NaN
2010-07-22 0.058470 NaN
... ... ...
2022-12-07 16848.251824 1232.552454
2022-12-08 17231.456322 1281.633052
2022-12-09 17136.149001 1263.208619
2022-12-10 17124.214680 1266.771442
2022-12-11 17094.360659 1263.069667

4530 rows × 2 columns

cleaning_df.fillna(method="bfill")
asset btc eth
date
2010-07-18 0.085840 1.199990
2010-07-19 0.080800 1.199990
2010-07-20 0.074736 1.199990
2010-07-21 0.079193 1.199990
2010-07-22 0.058470 1.199990
... ... ...
2022-12-07 16848.251824 1232.552454
2022-12-08 17231.456322 1281.633052
2022-12-09 17136.149001 1263.208619
2022-12-10 17124.214680 1266.771442
2022-12-11 17094.360659 1263.069667

4530 rows × 2 columns

# setup df for interpolation
interp_df = cleaning_df.iloc[cleaning_df.shape[0] - 5:, :].copy()
interp_df["btc_og"] = interp_df["btc"]
interp_df["eth_og"] = interp_df["eth"]
interp_df.iloc[1, 0:2] = [np.nan ,np.nan]

interp_df.interpolate(method="linear")
asset btc eth btc_og eth_og
date
2022-12-07 16848.251824 1232.552454 16848.251824 1232.552454
2022-12-08 16992.200413 1247.880537 17231.456322 1281.633052
2022-12-09 17136.149001 1263.208619 17136.149001 1263.208619
2022-12-10 17124.214680 1266.771442 17124.214680 1266.771442
2022-12-11 17094.360659 1263.069667 17094.360659 1263.069667

Selecting/Sampling#

test_df.select_dtypes(include='float64')
AdrActCnt PriceUSD
0 860.0 0.085840
1 929.0 0.080800
2 936.0 0.074736
3 784.0 0.079193
4 594.0 0.058470
... ... ...
7208 517421.0 1232.552454
7209 528927.0 1281.633052
7210 1505652.0 1263.208619
7211 733795.0 1266.771442
7212 723883.0 1263.069667

7213 rows × 2 columns

# test_df.sample(n = 200)
test_df.sample(frac = 0.25, random_state=42)
asset time AdrActCnt PriceUSD datetime dayname date
308 btc 2011-05-22T00:00:00.000000000Z 10846.0 6.711956 2011-05-22 00:00:00+00:00 Sunday 2011-05-22
381 btc 2011-08-03T00:00:00.000000000Z 18614.0 9.286357 2011-08-03 00:00:00+00:00 Wednesday 2011-08-03
5716 eth 2018-11-06T00:00:00.000000000Z 278348.0 218.089756 2018-11-06 00:00:00+00:00 Tuesday 2018-11-06
2312 btc 2016-11-15T00:00:00.000000000Z 651339.0 712.347153 2016-11-15 00:00:00+00:00 Tuesday 2016-11-15
251 btc 2011-03-26T00:00:00.000000000Z 4838.0 0.855200 2011-03-26 00:00:00+00:00 Saturday 2011-03-26
... ... ... ... ... ... ... ...
5314 eth 2017-09-30T00:00:00.000000000Z 150248.0 301.653575 2017-09-30 00:00:00+00:00 Saturday 2017-09-30
5546 eth 2018-05-20T00:00:00.000000000Z 389190.0 713.591691 2018-05-20 00:00:00+00:00 Sunday 2018-05-20
6894 eth 2022-01-27T00:00:00.000000000Z 537712.0 2412.624470 2022-01-27 00:00:00+00:00 Thursday 2022-01-27
2118 btc 2016-05-05T00:00:00.000000000Z 455031.0 449.336072 2016-05-05 00:00:00+00:00 Thursday 2016-05-05
1235 btc 2013-12-04T00:00:00.000000000Z 186429.0 1134.932231 2013-12-04 00:00:00+00:00 Wednesday 2013-12-04

1803 rows × 7 columns

Boolean Selection#

bool_df = test_df[["date", "asset", "PriceUSD"]].pivot(index="date", columns="asset", values="PriceUSD")
bool_df
asset btc eth
date
2010-07-18 0.085840 NaN
2010-07-19 0.080800 NaN
2010-07-20 0.074736 NaN
2010-07-21 0.079193 NaN
2010-07-22 0.058470 NaN
... ... ...
2022-12-07 16848.251824 1232.552454
2022-12-08 17231.456322 1281.633052
2022-12-09 17136.149001 1263.208619
2022-12-10 17124.214680 1266.771442
2022-12-11 17094.360659 1263.069667

4530 rows × 2 columns

# returns Series of same shape w/ np.NaN at failing rows (default)
# bool_df['PriceUSD'].where(bool_df['PriceUSD'] > 10**4)  # returns np.Nan in failing rows
bool_df['eth'].where(bool_df['eth'] > 10**3, 0)  # returns 0 in failing rows
date
2010-07-18       0.000000
2010-07-19       0.000000
2010-07-20       0.000000
2010-07-21       0.000000
2010-07-22       0.000000
                 ...     
2022-12-07    1232.552454
2022-12-08    1281.633052
2022-12-09    1263.208619
2022-12-10    1266.771442
2022-12-11    1263.069667
Name: eth, Length: 4530, dtype: float64
test_df["asset"].isin(["btc"])
0        True
1        True
2        True
3        True
4        True
        ...  
7208    False
7209    False
7210    False
7211    False
7212    False
Name: asset, Length: 7213, dtype: bool
na_check_df = bool_df.isna()
na_check_series = na_check_df.any(axis=1)  # aggregate booleans
bool_df.loc[na_check_series, :]
asset btc eth
date
2010-07-18 0.085840 NaN
2010-07-19 0.080800 NaN
2010-07-20 0.074736 NaN
2010-07-21 0.079193 NaN
2010-07-22 0.058470 NaN
... ... ...
2015-08-03 282.185052 NaN
2015-08-04 285.286617 NaN
2015-08-05 282.338887 NaN
2015-08-06 278.995749 NaN
2015-08-07 279.488715 NaN

1847 rows × 2 columns

bool_df.loc[(bool_df['eth'] > 10**3), :]
asset btc eth
date
2018-01-06 17103.589280 1006.157475
2018-01-07 16231.694999 1102.889537
2018-01-08 14937.415089 1134.699060
2018-01-09 14378.586217 1288.406875
2018-01-10 14669.088266 1231.767295
... ... ...
2022-12-07 16848.251824 1232.552454
2022-12-08 17231.456322 1281.633052
2022-12-09 17136.149001 1263.208619
2022-12-10 17124.214680 1266.771442
2022-12-11 17094.360659 1263.069667

731 rows × 2 columns

Boolean Operators#

bool_df.loc[~(bool_df['eth'] > 10**3), :]  # NOT
asset btc eth
date
2010-07-18 0.085840 NaN
2010-07-19 0.080800 NaN
2010-07-20 0.074736 NaN
2010-07-21 0.079193 NaN
2010-07-22 0.058470 NaN
... ... ...
2020-12-31 29022.671413 739.025850
2021-01-01 29380.693733 730.914321
2021-01-02 32022.681058 775.296622
2021-01-03 33277.835305 990.365325
2022-06-18 19013.867254 992.790097

3799 rows × 2 columns

bool_df.loc[(bool_df['eth'] > 10**3) & (bool_df['eth'] > 10), :]  # AND
asset btc eth
date
2018-01-06 17103.589280 1006.157475
2018-01-07 16231.694999 1102.889537
2018-01-08 14937.415089 1134.699060
2018-01-09 14378.586217 1288.406875
2018-01-10 14669.088266 1231.767295
... ... ...
2022-12-07 16848.251824 1232.552454
2022-12-08 17231.456322 1281.633052
2022-12-09 17136.149001 1263.208619
2022-12-10 17124.214680 1266.771442
2022-12-11 17094.360659 1263.069667

731 rows × 2 columns

bool_df.loc[(bool_df['eth'] > 10**3) | (bool_df['eth'] > 10), :]  # OR
asset btc eth
date
2016-03-05 397.314454 10.751006
2016-03-06 403.705187 10.984407
2016-03-09 412.540704 11.817099
2016-03-10 416.340929 11.165651
2016-03-11 419.511934 11.122905
... ... ...
2022-12-07 16848.251824 1232.552454
2022-12-08 17231.456322 1281.633052
2022-12-09 17136.149001 1263.208619
2022-12-10 17124.214680 1266.771442
2022-12-11 17094.360659 1263.069667

2380 rows × 2 columns

Datetime#

Python datetime <-> string formatting here

Datetime to string#

Python:

test_date_str = "2022-01-01"
print(dt.datetime.strptime(test_date_str, "%Y-%m-%d"))
print(dt.datetime.strptime(test_date_str, "%Y-%m-%d").date())
2022-01-01 00:00:00
2022-01-01

Pandas:

test_df["datetime"] = pd.to_datetime(test_df["time"], utc=True) # str timestamp to datetime
test_df['datetime_alt'] = pd.to_datetime(test_df["time"], format='%Y-%m-%dT%H:%M:%S.%fZ', utc=True)
test_df["date"] = pd.to_datetime(test_df["time"], utc=True).dt.date # datetime to date
test_df['datetime_str'] = test_df["datetime"].dt.strftime('%Y-%m-%dT%H:%M:%S.%fZ')  # datetime to str
test_df['date_str'] = pd.to_datetime(test_df["date"]).dt.strftime('%Y-%m-%d')  # date to str

test_df[["datetime", "datetime_alt", "date", "datetime_str", "date_str"]]
datetime datetime_alt date datetime_str date_str
0 2010-07-18 00:00:00+00:00 2010-07-18 00:00:00+00:00 2010-07-18 2010-07-18T00:00:00.000000Z 2010-07-18
1 2010-07-19 00:00:00+00:00 2010-07-19 00:00:00+00:00 2010-07-19 2010-07-19T00:00:00.000000Z 2010-07-19
2 2010-07-20 00:00:00+00:00 2010-07-20 00:00:00+00:00 2010-07-20 2010-07-20T00:00:00.000000Z 2010-07-20
3 2010-07-21 00:00:00+00:00 2010-07-21 00:00:00+00:00 2010-07-21 2010-07-21T00:00:00.000000Z 2010-07-21
4 2010-07-22 00:00:00+00:00 2010-07-22 00:00:00+00:00 2010-07-22 2010-07-22T00:00:00.000000Z 2010-07-22
... ... ... ... ... ...
7208 2022-12-07 00:00:00+00:00 2022-12-07 00:00:00+00:00 2022-12-07 2022-12-07T00:00:00.000000Z 2022-12-07
7209 2022-12-08 00:00:00+00:00 2022-12-08 00:00:00+00:00 2022-12-08 2022-12-08T00:00:00.000000Z 2022-12-08
7210 2022-12-09 00:00:00+00:00 2022-12-09 00:00:00+00:00 2022-12-09 2022-12-09T00:00:00.000000Z 2022-12-09
7211 2022-12-10 00:00:00+00:00 2022-12-10 00:00:00+00:00 2022-12-10 2022-12-10T00:00:00.000000Z 2022-12-10
7212 2022-12-11 00:00:00+00:00 2022-12-11 00:00:00+00:00 2022-12-11 2022-12-11T00:00:00.000000Z 2022-12-11

7213 rows × 5 columns

Datetime Altering#

Python:

test_date_str = "2022-01-01"
test_dt = dt.datetime.strptime(test_date_str, "%Y-%m-%d")
print(test_dt + dt.timedelta(days=2))
print(test_dt + relativedelta(years=5, months=4))
2022-01-03 00:00:00
2027-05-01 00:00:00

Pandas:

# (also pd.Timedelta)
test_df["date_offset"] = test_df["date"] + pd.DateOffset(years=2, months=2, days=1)

test_df[["date", "date_offset"]]
date date_offset
0 2010-07-18 2012-09-19
1 2010-07-19 2012-09-20
2 2010-07-20 2012-09-21
3 2010-07-21 2012-09-22
4 2010-07-22 2012-09-23
... ... ...
7208 2022-12-07 2025-02-08
7209 2022-12-08 2025-02-09
7210 2022-12-09 2025-02-10
7211 2022-12-10 2025-02-11
7212 2022-12-11 2025-02-12

7213 rows × 2 columns

Date Parts (Week & Weekday)#

Python:

test_date_str = "2022-01-01"
test_dt = dt.datetime.strptime(test_date_str, "%Y-%m-%d")
print(test_dt.isocalendar().week)
print(test_dt.weekday())
print(test_dt.strftime('%A'))  # get day name
52
5
Saturday

Pandas:

test_df["week"] = test_df["datetime"].dt.isocalendar().week
test_df["weekday"] = test_df["datetime"].dt.weekday
test_df["dayname"] = test_df["datetime"].dt.day_name()
test_df["monthname"] = test_df["datetime"].dt.month_name()
test_df["year"] = test_df["datetime"].dt.year

test_df[["date", "week", "weekday", "dayname", "monthname", "year"]]
date week weekday dayname monthname year
0 2010-07-18 28 6 Sunday July 2010
1 2010-07-19 29 0 Monday July 2010
2 2010-07-20 29 1 Tuesday July 2010
3 2010-07-21 29 2 Wednesday July 2010
4 2010-07-22 29 3 Thursday July 2010
... ... ... ... ... ... ...
7208 2022-12-07 49 2 Wednesday December 2022
7209 2022-12-08 49 3 Thursday December 2022
7210 2022-12-09 49 4 Friday December 2022
7211 2022-12-10 49 5 Saturday December 2022
7212 2022-12-11 49 6 Sunday December 2022

7213 rows × 6 columns

Numerical#

num_df = test_df.loc[3000:, ["date", "asset", "PriceUSD"]].copy()
num_df["PriceUSD_rnd"] = num_df["PriceUSD"].round()
num_df["PriceUSD_rnd1"] = num_df["PriceUSD"].round(1)
num_df["PriceUSD_floor"] = np.floor(num_df["PriceUSD"])
num_df["PriceUSD_ceil"] = np.ceil(num_df["PriceUSD"])
num_df
date asset PriceUSD PriceUSD_rnd PriceUSD_rnd1 PriceUSD_floor PriceUSD_ceil
3000 2018-10-04 btc 6545.285668 6545.0 6545.3 6545.0 6546.0
3001 2018-10-05 btc 6585.149580 6585.0 6585.1 6585.0 6586.0
3002 2018-10-06 btc 6550.478316 6550.0 6550.5 6550.0 6551.0
3003 2018-10-07 btc 6564.568260 6565.0 6564.6 6564.0 6565.0
3004 2018-10-08 btc 6604.685274 6605.0 6604.7 6604.0 6605.0
... ... ... ... ... ... ... ...
7208 2022-12-07 eth 1232.552454 1233.0 1232.6 1232.0 1233.0
7209 2022-12-08 eth 1281.633052 1282.0 1281.6 1281.0 1282.0
7210 2022-12-09 eth 1263.208619 1263.0 1263.2 1263.0 1264.0
7211 2022-12-10 eth 1266.771442 1267.0 1266.8 1266.0 1267.0
7212 2022-12-11 eth 1263.069667 1263.0 1263.1 1263.0 1264.0

4213 rows × 7 columns

Transforms#

Indexes#

  • df.set_index(keys, drop=True, verify_integrity=False)

  • df.reset_index(drop=False)

  • df.reindex()

Pivot & Melt#

  • df.unstack() - pivot a level of a (multilevel) row index into columns (DataFrame method, not a top-level pandas function)

Pivot to MultiIndex#

pivot_df = test_df.pivot(index="date", columns="asset", values=['AdrActCnt', 'PriceUSD'])
pivot_df.reset_index(drop=False, inplace=True)
pivot_df
date AdrActCnt PriceUSD
asset btc eth btc eth
0 2010-07-18 860.0 NaN 0.085840 NaN
1 2010-07-19 929.0 NaN 0.080800 NaN
2 2010-07-20 936.0 NaN 0.074736 NaN
3 2010-07-21 784.0 NaN 0.079193 NaN
4 2010-07-22 594.0 NaN 0.058470 NaN
... ... ... ... ... ...
4525 2022-12-07 899850.0 517421.0 16848.251824 1232.552454
4526 2022-12-08 905789.0 528927.0 17231.456322 1281.633052
4527 2022-12-09 946767.0 1505652.0 17136.149001 1263.208619
4528 2022-12-10 909506.0 733795.0 17124.214680 1266.771442
4529 2022-12-11 781176.0 723883.0 17094.360659 1263.069667

4530 rows × 5 columns

Melt from MultiIndex#

# pivot_df = pd.melt(pivot_df, col_level=0, id_vars=["date"])
pivot_df = pd.melt(pivot_df, id_vars=[("date", "")])
pivot_df.columns = ["date", "metric", "asset", "value"]
pivot_df
date metric asset value
0 2010-07-18 AdrActCnt btc 860.000000
1 2010-07-19 AdrActCnt btc 929.000000
2 2010-07-20 AdrActCnt btc 936.000000
3 2010-07-21 AdrActCnt btc 784.000000
4 2010-07-22 AdrActCnt btc 594.000000
... ... ... ... ...
18115 2022-12-07 PriceUSD eth 1232.552454
18116 2022-12-08 PriceUSD eth 1281.633052
18117 2022-12-09 PriceUSD eth 1263.208619
18118 2022-12-10 PriceUSD eth 1266.771442
18119 2022-12-11 PriceUSD eth 1263.069667

18120 rows × 4 columns

Pivot back to OG (single Index)#

pivot_df = pivot_df.pivot(index=["date", "asset"], columns="metric", values="value")
pivot_df.columns = pivot_df.columns.rename("")
pivot_df.reset_index(drop=False, inplace=True)
pivot_df
date asset AdrActCnt PriceUSD
0 2010-07-18 btc 860.0 0.085840
1 2010-07-18 eth NaN NaN
2 2010-07-19 btc 929.0 0.080800
3 2010-07-19 eth NaN NaN
4 2010-07-20 btc 936.0 0.074736
... ... ... ... ...
9055 2022-12-09 eth 1505652.0 1263.208619
9056 2022-12-10 btc 909506.0 17124.214680
9057 2022-12-10 eth 733795.0 1266.771442
9058 2022-12-11 btc 781176.0 17094.360659
9059 2022-12-11 eth 723883.0 1263.069667

9060 rows × 4 columns

Pivot to Date Index (Fill missing dates)#

date_rng = pd.date_range(test_df["date"].min(), test_df["date"].max())
pivot_fill_df = test_df.pivot(index="date", columns="asset", values='PriceUSD')
# pivot_fill_df = pivot_fill_df.drop(labels=[1, 3, 4524], axis=0)  # drop by index value
pivot_fill_df = pivot_fill_df.drop(pivot_fill_df.index[[1, 3, 4524]])  # drop by row num
pivot_fill_df = pivot_fill_df.reindex(date_rng, fill_value=np.nan)
pivot_fill_df
asset btc eth
2010-07-18 0.085840 NaN
2010-07-19 NaN NaN
2010-07-20 0.074736 NaN
2010-07-21 NaN NaN
2010-07-22 0.058470 NaN
... ... ...
2022-12-07 16848.251824 1232.552454
2022-12-08 17231.456322 1281.633052
2022-12-09 17136.149001 1263.208619
2022-12-10 17124.214680 1266.771442
2022-12-11 17094.360659 1263.069667

4530 rows × 2 columns

Join & Merge#

Merge:

  • can join on indices or columns

  • validate - check 1:1, 1:many, etc. (also available for join)

  • indicator - produces additional column to indicate “left_only”, “right_only”, or “both”

join_merge_df = test_df.loc[3000:, ["date", "datetime", "asset", 'AdrActCnt', 'PriceUSD']].copy()
join_merge_df = join_merge_df.sort_values(by="date").reset_index(drop=True)
join_merge_df
date datetime asset AdrActCnt PriceUSD
0 2015-08-08 2015-08-08 00:00:00+00:00 eth 1208.0 1.199990
1 2015-08-09 2015-08-09 00:00:00+00:00 eth 1113.0 1.199990
2 2015-08-10 2015-08-10 00:00:00+00:00 eth 1430.0 1.199990
3 2015-08-11 2015-08-11 00:00:00+00:00 eth 2697.0 0.990000
4 2015-08-12 2015-08-12 00:00:00+00:00 eth 1219.0 1.288000
... ... ... ... ... ...
4208 2022-12-09 2022-12-09 00:00:00+00:00 eth 1505652.0 1263.208619
4209 2022-12-10 2022-12-10 00:00:00+00:00 eth 733795.0 1266.771442
4210 2022-12-10 2022-12-10 00:00:00+00:00 btc 909506.0 17124.214680
4211 2022-12-11 2022-12-11 00:00:00+00:00 btc 781176.0 17094.360659
4212 2022-12-11 2022-12-11 00:00:00+00:00 eth 723883.0 1263.069667

4213 rows × 5 columns

join_cols = ["date", "asset"]
join_merge_df1 = join_merge_df.loc[:3000, join_cols + ["AdrActCnt"]]
join_merge_df2 = join_merge_df.loc[:, join_cols + ["PriceUSD"]]
print(join_merge_df1)
print(join_merge_df2)
            date asset  AdrActCnt
0     2015-08-08   eth     1208.0
1     2015-08-09   eth     1113.0
2     2015-08-10   eth     1430.0
3     2015-08-11   eth     2697.0
4     2015-08-12   eth     1219.0
...          ...   ...        ...
2996  2021-04-12   btc  1126723.0
2997  2021-04-13   eth   641476.0
2998  2021-04-13   btc  1178027.0
2999  2021-04-14   eth   662089.0
3000  2021-04-14   btc  1149773.0

[3001 rows x 3 columns]
            date asset      PriceUSD
0     2015-08-08   eth      1.199990
1     2015-08-09   eth      1.199990
2     2015-08-10   eth      1.199990
3     2015-08-11   eth      0.990000
4     2015-08-12   eth      1.288000
...          ...   ...           ...
4208  2022-12-09   eth   1263.208619
4209  2022-12-10   eth   1266.771442
4210  2022-12-10   btc  17124.214680
4211  2022-12-11   btc  17094.360659
4212  2022-12-11   eth   1263.069667

[4213 rows x 3 columns]
joined_df = join_merge_df1.join(join_merge_df2.set_index(keys=join_cols), how="outer", on=join_cols)
joined_df
date asset AdrActCnt PriceUSD
0 2015-08-08 eth 1208.0 1.199990
1 2015-08-09 eth 1113.0 1.199990
2 2015-08-10 eth 1430.0 1.199990
3 2015-08-11 eth 2697.0 0.990000
4 2015-08-12 eth 1219.0 1.288000
... ... ... ... ...
3000 2022-12-09 eth NaN 1263.208619
3000 2022-12-10 eth NaN 1266.771442
3000 2022-12-10 btc NaN 17124.214680
3000 2022-12-11 btc NaN 17094.360659
3000 2022-12-11 eth NaN 1263.069667

4213 rows × 4 columns

merged_df = join_merge_df1.merge(join_merge_df2, how="outer", on=join_cols, indicator=True)
merged_df
date asset AdrActCnt PriceUSD _merge
0 2015-08-08 eth 1208.0 1.199990 both
1 2015-08-09 eth 1113.0 1.199990 both
2 2015-08-10 eth 1430.0 1.199990 both
3 2015-08-11 eth 2697.0 0.990000 both
4 2015-08-12 eth 1219.0 1.288000 both
... ... ... ... ... ...
4208 2022-12-09 eth NaN 1263.208619 right_only
4209 2022-12-10 eth NaN 1266.771442 right_only
4210 2022-12-10 btc NaN 17124.214680 right_only
4211 2022-12-11 btc NaN 17094.360659 right_only
4212 2022-12-11 eth NaN 1263.069667 right_only

4213 rows × 5 columns

Explode#

# Sample data for explode(): the "day4" entry for city C is a list,
# which explode will expand into one row per element.
explode_df = pd.DataFrame(
    {
        "city": ["A", "B", "C"],
        "day1": [22, 25, 21],
        "day2": [31, 12, 67],
        "day3": [27, 20, 15],
        "day4": [34, 37, [41, 45, 67, 90, 21]],
        "day5": [23, 54, 36],
    }
)
explode_df
city day1 day2 day3 day4 day5
0 A 22 31 27 34 23
1 B 25 12 20 37 54
2 C 21 67 15 [41, 45, 67, 90, 21] 36
explode_df.explode("day4", ignore_index=False)
city day1 day2 day3 day4 day5
0 A 22 31 27 34 23
1 B 25 12 20 37 54
2 C 21 67 15 41 36
3 C 21 67 15 45 36
4 C 21 67 15 67 36
5 C 21 67 15 90 36
6 C 21 67 15 21 36

Aggregation#

Aggregation functions#

  • mean(): Compute mean of groups

  • sum(): Compute sum of group values

  • size(): Compute group sizes

  • count(): Compute count of group

  • std(): Standard deviation of groups

  • var(): Compute variance of groups

  • sem(): Standard error of the mean of groups

  • first(): Compute first of group values

  • last(): Compute last of group values

  • nth() : Take nth value, or a subset if n is a list

  • min(): Compute min of group values

  • max(): Compute max of group values

# Restrict to the metric columns so the aggregations below apply column-wise
agg_df = test_df[test_metrics]
# Non-null count per column
agg_df.count()
AdrActCnt    7213
PriceUSD     7213
dtype: int64
agg_df.nunique()
AdrActCnt    7152
PriceUSD     7162
dtype: int64
agg_df.median()
AdrActCnt    406267.000000
PriceUSD        467.321788
dtype: float64
agg_df.quantile(q=0.10)
AdrActCnt    14612.000000
PriceUSD         5.476572
Name: 0.1, dtype: float64

Groupby#

DataFrame.groupby(by=None, axis=0, level=None, as_index=True, sort=True, group_keys=_NoDefault.no_default, observed=False, dropna=True)

# Keep the date/year/asset keys alongside the metrics for the groupby examples
group_df = test_df[["date", "year", "asset"] + test_metrics]
# Distinct values of every other column, computed within each asset group
group_df.groupby(by="asset").nunique()
date year AdrActCnt PriceUSD
asset
btc 4530 13 4504 4507
eth 2683 8 2667 2658
group_df.groupby(by=["year", "asset"]).nunique()
date AdrActCnt PriceUSD
year asset
2010 btc 167 151 154
2011 btc 365 363 356
2012 btc 366 363 366
2013 btc 365 365 365
2014 btc 365 365 365
... ... ... ... ...
2020 eth 366 365 366
2021 btc 365 364 365
eth 365 365 365
2022 btc 345 345 345
eth 345 344 345

21 rows × 3 columns

Groupby Map#

# Per-column aggregation spec: column name -> list of aggregations to apply.
# The lambda computes the yearly price range and appears as "<lambda_0>" in the result.
agg_map = {
    'AdrActCnt': ['count', 'nunique'],
    'PriceUSD': ['max', 'min', lambda x: x.max() - x.min()],
}
group_df.groupby(by=["year", "asset"]).agg(agg_map)
AdrActCnt PriceUSD
count nunique max min <lambda_0>
year asset
2010 btc 167 151 0.400982 0.050541 0.350442
2011 btc 365 363 29.029921 0.295000 28.734921
2012 btc 366 363 13.755252 4.255239 9.500012
2013 btc 365 365 1134.932231 13.280607 1121.651624
2014 btc 365 365 914.459961 310.442004 604.017957
... ... ... ... ... ... ...
2020 eth 366 365 751.208064 110.328202 640.879862
2021 btc 365 364 67541.755508 29380.693733 38161.061775
eth 365 365 4811.156463 730.914321 4080.242142
2022 btc 345 345 47560.009382 15758.291282 31801.718100
eth 345 344 3832.365610 992.790097 2839.575513

21 rows × 5 columns

Groupby Datetime#

DataFrame.groupby(pd.Grouper(key="datetime", axis=0, freq="M"))

# Reshape long -> wide: one row per datetime, one column per (metric, asset) pair
group_dt_df = test_df.pivot(index="datetime", columns="asset", values=test_metrics)
group_dt_df
AdrActCnt PriceUSD
asset btc eth btc eth
datetime
2010-07-18 00:00:00+00:00 860.0 NaN 0.085840 NaN
2010-07-19 00:00:00+00:00 929.0 NaN 0.080800 NaN
2010-07-20 00:00:00+00:00 936.0 NaN 0.074736 NaN
2010-07-21 00:00:00+00:00 784.0 NaN 0.079193 NaN
2010-07-22 00:00:00+00:00 594.0 NaN 0.058470 NaN
... ... ... ... ...
2022-12-07 00:00:00+00:00 899850.0 517421.0 16848.251824 1232.552454
2022-12-08 00:00:00+00:00 905789.0 528927.0 17231.456322 1281.633052
2022-12-09 00:00:00+00:00 946767.0 1505652.0 17136.149001 1263.208619
2022-12-10 00:00:00+00:00 909506.0 733795.0 17124.214680 1266.771442
2022-12-11 00:00:00+00:00 781176.0 723883.0 17094.360659 1263.069667

4530 rows × 4 columns

group_dt_df.groupby(pd.Grouper(axis=0, freq='M')).last()
AdrActCnt PriceUSD
asset btc eth btc eth
datetime
2010-07-31 00:00:00+00:00 479.0 NaN 0.067546 NaN
2010-08-31 00:00:00+00:00 733.0 NaN 0.060000 NaN
2010-09-30 00:00:00+00:00 640.0 NaN 0.061900 NaN
2010-10-31 00:00:00+00:00 609.0 NaN 0.192500 NaN
2010-11-30 00:00:00+00:00 817.0 NaN 0.208200 NaN
... ... ... ... ...
2022-08-31 00:00:00+00:00 916812.0 467318.0 20024.671606 1551.729221
2022-09-30 00:00:00+00:00 1005499.0 484536.0 19435.194692 1327.075996
2022-10-31 00:00:00+00:00 980614.0 476137.0 20490.858181 1571.203871
2022-11-30 00:00:00+00:00 946754.0 525545.0 17176.898691 1298.039420
2022-12-31 00:00:00+00:00 781176.0 723883.0 17094.360659 1263.069667

150 rows × 4 columns

group_dt_df.groupby(pd.Grouper(axis=0, freq='Y')).first()
AdrActCnt PriceUSD
asset btc eth btc eth
datetime
2010-12-31 00:00:00+00:00 860.0 NaN 0.085840 NaN
2011-12-31 00:00:00+00:00 1071.0 NaN 0.300000 NaN
2012-12-31 00:00:00+00:00 11474.0 NaN 5.294843 NaN
2013-12-31 00:00:00+00:00 38733.0 NaN 13.331371 NaN
2014-12-31 00:00:00+00:00 96516.0 NaN 752.404550 NaN
... ... ... ... ...
2018-12-31 00:00:00+00:00 972783.0 520683.0 13464.653612 756.071766
2019-12-31 00:00:00+00:00 433715.0 227755.0 3808.117832 139.154644
2020-12-31 00:00:00+00:00 524360.0 231794.0 7170.631869 129.963875
2021-12-31 00:00:00+00:00 1001890.0 511250.0 29380.693733 730.914321
2022-12-31 00:00:00+00:00 695722.0 581311.0 47560.009382 3761.059640

13 rows × 4 columns

Rolling/Window#

DataFrame.rolling(window, min_periods=None, center=False, win_type=None, on=None, axis=0, closed=None, step=None, method='single')

Shorthand methods:

  • cumsum()

  • pct_change()

# Wide layout (one column per metric/asset pair) so rolling windows run per column
rolling_df = test_df.pivot(index="datetime", columns="asset", values=test_metrics)
rolling_df
AdrActCnt PriceUSD
asset btc eth btc eth
datetime
2010-07-18 00:00:00+00:00 860.0 NaN 0.085840 NaN
2010-07-19 00:00:00+00:00 929.0 NaN 0.080800 NaN
2010-07-20 00:00:00+00:00 936.0 NaN 0.074736 NaN
2010-07-21 00:00:00+00:00 784.0 NaN 0.079193 NaN
2010-07-22 00:00:00+00:00 594.0 NaN 0.058470 NaN
... ... ... ... ...
2022-12-07 00:00:00+00:00 899850.0 517421.0 16848.251824 1232.552454
2022-12-08 00:00:00+00:00 905789.0 528927.0 17231.456322 1281.633052
2022-12-09 00:00:00+00:00 946767.0 1505652.0 17136.149001 1263.208619
2022-12-10 00:00:00+00:00 909506.0 733795.0 17124.214680 1266.771442
2022-12-11 00:00:00+00:00 781176.0 723883.0 17094.360659 1263.069667

4530 rows × 4 columns

rolling_df.rolling(5, min_periods=3, center=False).mean()
AdrActCnt PriceUSD
asset btc eth btc eth
datetime
2010-07-18 00:00:00+00:00 NaN NaN NaN NaN
2010-07-19 00:00:00+00:00 NaN NaN NaN NaN
2010-07-20 00:00:00+00:00 908.333333 NaN 0.080459 NaN
2010-07-21 00:00:00+00:00 877.250000 NaN 0.080142 NaN
2010-07-22 00:00:00+00:00 820.600000 NaN 0.075808 NaN
... ... ... ... ...
2022-12-07 00:00:00+00:00 916719.200000 566109.2 16980.268361 1256.446487
2022-12-08 00:00:00+00:00 910106.200000 552708.4 17046.658149 1264.586847
2022-12-09 00:00:00+00:00 930989.400000 727531.4 17049.544952 1261.221877
2022-12-10 00:00:00+00:00 920778.000000 765902.2 17082.117531 1262.870242
2022-12-11 00:00:00+00:00 888617.600000 801935.6 17086.886497 1261.447047

4530 rows × 4 columns

Gaussian Window:

import scipy.signal  # robustness: "import scipy" alone does not guarantee the signal submodule is loaded

# Sample an 18-point Gaussian window and plot it against a z-score axis
num = 18
std_dev = 4.5
window = scipy.signal.windows.gaussian(num, std=std_dev)
# Express the x-axis in standard deviations from the window center
z_score = np.linspace(-num/2/std_dev, num/2/std_dev, num)
plt.plot(z_score, window)
# Fix: the title previously hard-coded sigma=7 even though std_dev is 4.5
plt.title(rf"Gaussian window ($\sigma$={std_dev})")
plt.ylabel("Amplitude")
plt.xlabel("Z-Score")
plt.figure()
<Figure size 640x480 with 0 Axes>
_images/pandas-cheatsheet_121_1.png
<Figure size 640x480 with 0 Axes>

Binning#

cut#

Bins are of equal size (width), but the number of entries per bin may not be similar.

# Equal-width binning: 5 bins of identical width spanning the price range;
# the per-bin counts are typically uneven.
# returns pd.Series
pd.cut(test_df['PriceUSD'], bins=5).value_counts()
(-67.491, 13508.392]      6401
(13508.392, 27016.733]     281
(40525.074, 54033.415]     211
(27016.733, 40525.074]     208
(54033.415, 67541.756]     112
Name: PriceUSD, dtype: int64

qcut#

Bins are not of equal size (width), but the number of entries per bin is similar.

# Quantile binning: 5 bins with (roughly) equal numbers of entries,
# so the bin widths vary with the data distribution.
# returns pd.Series
pd.qcut(test_df['PriceUSD'], q=5).value_counts()
(0.0495, 14.711]        1443
(285.218, 975.619]      1443
(7341.99, 67541.756]    1443
(14.711, 285.218]       1442
(975.619, 7341.99]      1442
Name: PriceUSD, dtype: int64

Strings#

Regex cheat sheet here

Python string formatting cookbook

# Load NBA roster data from a local CSV (absolute Windows path)
nba_df = pd.read_csv("C:/Users/wsaye/PycharmProjects/CashAppInterview/data/nba.csv")
# Drop every row containing any missing value, mutating nba_df in place
nba_df.dropna(inplace=True)
nba_df
Name Team Number Position Age Height Weight College Salary
0 Avery Bradley Boston Celtics 0.0 PG 25.0 6-2 180.0 Texas 7730337.0
1 Jae Crowder Boston Celtics 99.0 SF 25.0 6-6 235.0 Marquette 6796117.0
3 R.J. Hunter Boston Celtics 28.0 SG 22.0 6-5 185.0 Georgia State 1148640.0
6 Jordan Mickey Boston Celtics 55.0 PF 21.0 6-8 235.0 LSU 1170960.0
7 Kelly Olynyk Boston Celtics 41.0 C 25.0 7-0 238.0 Gonzaga 2165160.0
... ... ... ... ... ... ... ... ... ...
449 Rodney Hood Utah Jazz 5.0 SG 23.0 6-8 206.0 Duke 1348440.0
451 Chris Johnson Utah Jazz 23.0 SF 26.0 6-6 206.0 Dayton 981348.0
452 Trey Lyles Utah Jazz 41.0 PF 20.0 6-10 234.0 Kentucky 2239800.0
453 Shelvin Mack Utah Jazz 8.0 PG 26.0 6-3 203.0 Butler 2433333.0
456 Jeff Withey Utah Jazz 24.0 C 26.0 7-0 231.0 Kansas 947276.0

364 rows × 9 columns

Search/Replace#

# does not assume regex
# (nba_df["Name"].str.find("a") >= 1).sum()
# str.find returns the index of the first occurrence (-1 if absent);
# ">= 1" is True only when "J" appears at position 1 or later (not at the start)
nba_df["Name"].str.find("J") >= 1
0      False
1      False
3       True
6      False
7      False
       ...  
449    False
451     True
452    False
453    False
456    False
Name: Name, Length: 364, dtype: bool
# assumes regex
# nba_df["Name"].str.contains("^[jJ]")
# nba_df["Name"].str.contains("^(Jo)")
# extract returns the capture group where the pattern matches, NaN elsewhere
nba_df["Name"].str.extract("^(Jo)")
0
0 NaN
1 NaN
3 NaN
6 Jo
7 NaN
... ...
449 NaN
451 NaN
452 NaN
453 NaN
456 NaN

364 rows × 1 columns

nba_df["Name"].str.replace("^(Jo)", "PORKY", regex=True)
0         Avery Bradley
1           Jae Crowder
3           R.J. Hunter
6      PORKYrdan Mickey
7          Kelly Olynyk
             ...       
449         Rodney Hood
451       Chris Johnson
452          Trey Lyles
453        Shelvin Mack
456         Jeff Withey
Name: Name, Length: 364, dtype: object

Split, Concat#

# assumes regex if len > 1, else literal
# Split each name on spaces into a Python list of tokens
nba_df["Name"].str.split(" ")
0      [Avery, Bradley]
1        [Jae, Crowder]
3        [R.J., Hunter]
6      [Jordan, Mickey]
7       [Kelly, Olynyk]
             ...       
449      [Rodney, Hood]
451    [Chris, Johnson]
452       [Trey, Lyles]
453     [Shelvin, Mack]
456      [Jeff, Withey]
Name: Name, Length: 364, dtype: object
# assumes regex if len > 1, else literal
# n=1 limits to a single split; expand=True returns a DataFrame (one column per part)
nba_df["Name"].str.split(" ", n=1, expand=True)
0 1
0 Avery Bradley
1 Jae Crowder
3 R.J. Hunter
6 Jordan Mickey
7 Kelly Olynyk
... ... ...
449 Rodney Hood
451 Chris Johnson
452 Trey Lyles
453 Shelvin Mack
456 Jeff Withey

364 rows × 2 columns

nba_df["Name"].str.split(" ").str.len().describe()
count    364.000000
mean       2.032967
std        0.220220
min        2.000000
25%        2.000000
50%        2.000000
75%        2.000000
max        5.000000
Name: Name, dtype: float64
nba_df["Name"].str.cat(others=[nba_df["Team"], nba_df["College"]], sep="-")
0            Avery Bradley-Boston Celtics-Texas
1          Jae Crowder-Boston Celtics-Marquette
3      R.J. Hunter-Boston Celtics-Georgia State
6              Jordan Mickey-Boston Celtics-LSU
7           Kelly Olynyk-Boston Celtics-Gonzaga
                         ...                   
449                  Rodney Hood-Utah Jazz-Duke
451              Chris Johnson-Utah Jazz-Dayton
452               Trey Lyles-Utah Jazz-Kentucky
453               Shelvin Mack-Utah Jazz-Butler
456                Jeff Withey-Utah Jazz-Kansas
Name: Name, Length: 364, dtype: object

Formatting#

# nba_df["Name"].str.upper()
# Lower-case every name
nba_df["Name"].str.lower()
0      avery bradley
1        jae crowder
3        r.j. hunter
6      jordan mickey
7       kelly olynyk
           ...      
449      rodney hood
451    chris johnson
452       trey lyles
453     shelvin mack
456      jeff withey
Name: Name, Length: 364, dtype: object
# Scale salary to millions and format as a currency string, e.g. "$7.73M"
nba_df['Salary_str'] = (nba_df['Salary']/10**6).map('${:,.2f}M'.format)  # for single Series
nba_df['Salary_str']
# cohort_crosstab = cohort_crosstab.applymap('{:,.0f}%'.format)  # for entire df
0      $7.73M
1      $6.80M
3      $1.15M
6      $1.17M
7      $2.17M
        ...  
449    $1.35M
451    $0.98M
452    $2.24M
453    $2.43M
456    $0.95M
Name: Salary_str, Length: 364, dtype: object

Modeling#

Linear Regression#

# Load the regression test data and pull each column out as a 2-D (n, 1) numpy array
lin_reg_df = pd.read_csv('C:/Users/wsaye/PycharmProjects/CashAppInterview/data/lin_reg_test_data.csv')  # load data set
X = lin_reg_df.iloc[:, 0].values.reshape(-1, 1)  # values converts it into a numpy array
Y = lin_reg_df.iloc[:, 1].values.reshape(-1, 1)  # -1 means that calculate the dimension of rows, but have 1 column
COLOR = lin_reg_df.iloc[:, 2].values.reshape(-1, 1)

# One-hot encode the categorical color column
# NOTE(review): the sparse_output= keyword requires scikit-learn >= 1.2 — confirm the pinned version
ohe = OneHotEncoder(sparse_output=False)
ohe_vals = ohe.fit_transform(COLOR)

# Design matrix: numeric x plus the one-hot color indicator columns
X_mat = np.concatenate([X, ohe_vals], axis=1)
def regression_results(y_true, y_pred, lin_reg):
    """Print a summary of regression-fit metrics plus the model's parameters.

    Parameters:
        y_true: array-like of ground-truth target values.
        y_pred: array-like of predicted target values.
        lin_reg: fitted estimator exposing ``coef_`` and ``intercept_``
            (e.g. sklearn LinearRegression).
    """
    # Regression metrics
    explained_variance = metrics.explained_variance_score(y_true, y_pred)
    mean_absolute_error = metrics.mean_absolute_error(y_true, y_pred)
    mse = metrics.mean_squared_error(y_true, y_pred)
    # NOTE: mean_squared_log_error raises ValueError if any target/prediction is negative
    mean_squared_log_error = metrics.mean_squared_log_error(y_true, y_pred)
    median_absolute_error = metrics.median_absolute_error(y_true, y_pred)
    r2 = metrics.r2_score(y_true, y_pred)

    print('explained_variance: ', round(explained_variance,4))    
    print('mean_squared_log_error: ', round(mean_squared_log_error,4))
    print('r^2: ', round(r2,4))
    print('MAE: ', round(mean_absolute_error,4))
    # Fix: median_absolute_error was computed but never reported
    print('MedAE: ', round(median_absolute_error,4))
    print('MSE: ', round(mse,4))
    print('RMSE: ', round(np.sqrt(mse),4))
    print(f"Coefficients: {lin_reg.coef_}")
    print(f"Intercept: {lin_reg.intercept_}")

Test/Train Split#

X_train, X_test, y_train, y_test = train_test_split(X_mat, Y, test_size = 0.25)

Fit, Predict, Summarize#

regr = LinearRegression()

# Fit on the training split, then predict on the held-out split
regr.fit(X_train, y_train)
y_test_pred = regr.predict(X_test)
# Also predict on the full design matrix so the plots below can show the fit
lin_reg_df["y_pred"] = regr.predict(X_mat)

regression_results(y_test, y_test_pred, regr)
explained_variance:  0.9397
mean_squared_log_error:  0.0248
r^2:  0.9381
MAE:  10.1719
MSE:  151.7468
RMSE:  12.3186
Coefficients: [[   1.38349831  112.5431928  -125.45870138   12.91550859]]
Intercept: [-8.26839901]

K-Means#

from sklearn.cluster import KMeans

# Cluster in (x, y) space: features and targets are stacked side by side
X_ktrain, X_ktest, y_ktrain, y_ktest = train_test_split(X, Y, test_size = 0.25)

train_set = np.concatenate([X_ktrain, y_ktrain], axis=1)
test_set = np.concatenate([X_ktest, y_ktest], axis=1)

# NOTE(review): n_init="auto" requires scikit-learn >= 1.2 — confirm the pinned version
kmeans = KMeans(n_clusters=3, random_state=0, n_init="auto").fit(train_set)
train_labels = kmeans.labels_
# Assign each held-out point to its nearest learned centroid
test_labels = kmeans.predict(test_set)

Plot Model:

# Side-by-side scatter plots: training clusters (left) vs. test assignments (right);
# marker color encodes the cluster label, "x" symbols mark test points
fig = make_subplots(rows=1, cols=2)
fig.add_trace(go.Scatter(x=X_ktrain[:, 0], y=y_ktrain[:, 0], mode='markers', name='train',
                         marker=dict(color=train_labels, line_color="black", line_width=1)), row=1, col=1)
fig.add_trace(go.Scatter(x=X_ktest[:, 0], y=y_ktest[:, 0], mode='markers', name='test',
                         marker=dict(color=test_labels, line_color="black", line_width=1, symbol="x")), row=1, col=2)
fig.show()

Plotting#

Matplotlib#

# plt.scatter(X, Y)
# plt.plot(X, Y_pred, color="red", linestyle='None', marker="x")
# Raw data as dots; model predictions as green "x" markers (no connecting line)
plt.scatter(lin_reg_df["x"], lin_reg_df["y"])
plt.plot(lin_reg_df["x"], lin_reg_df["y_pred"], color="green", linestyle='None', marker="x")

plt.show()
_images/pandas-cheatsheet_153_0.png

Plotly (Standard)#

fig = go.Figure()
# Raw observations in grey
fig.add_trace(go.Scatter(x=X[:, 0], y=Y[:, 0], mode='markers', name='raw data', marker=dict(color="grey")))
# fig.add_trace(go.Scatter(x=X[:, 0], y=Y_pred[:, 0], mode='markers', name='prediction', marker=dict(color=COLOR[:, 0])))

# One prediction line per color category, drawn in that color
for c in list(np.unique(COLOR[:, 0])):
    temp_x = lin_reg_df.loc[lin_reg_df["color"]==c, "x"]
    temp_y = lin_reg_df.loc[lin_reg_df["color"]==c, "y_pred"]
    fig.add_trace(go.Scatter(x=temp_x, y=temp_y, mode='lines', name='pred-' + c, line_color=c))

fig.show()

Plotly Express#

# One-liner scatter colored by category
fig = px.scatter(lin_reg_df, x="x", y="y", color="color")
fig.show()
# facet_col splits the figure into one panel per color value
fig = px.scatter(lin_reg_df, x="x", y="y", color="color", 
                 facet_col="color", 
#                  facet_row="time", 
#                  trendline="ols"
                )
fig.show()

Sample Analyses#

Cohort#

# Simulate per-user activity logs over a 10-day window for a cohort analysis.
rand_gen = np.random.RandomState(2021)  # fixed seed for reproducibility

start_date = dt.datetime.strptime("2022-01-01", "%Y-%m-%d")
end_date = dt.datetime.strptime("2022-01-10", "%Y-%m-%d")
date_rng = pd.date_range(start_date, end_date).values

total_days = len(date_rng)
num_users = 1000
user_frames = []

for user in range(num_users):
    # Each user is active on 2..(total_days-1) randomly chosen days (repeats possible)
    n_days = rand_gen.randint(low=2, high=total_days)
    day_idx = rand_gen.randint(low=0, high=total_days, size=(1, n_days))
    frame = pd.DataFrame({
        "date": pd.Series(date_rng[day_idx[0, :]]),
        "user_id": pd.Series([user] * n_days),
    })
    user_frames.append(frame)

# Stack all per-user frames into one long activity log
cohort_df = pd.concat(user_frames)
cohort_df
date user_id
0 2022-01-06 0
1 2022-01-10 0
2 2022-01-01 0
3 2022-01-07 0
4 2022-01-06 0
... ... ...
2 2022-01-10 999
3 2022-01-06 999
4 2022-01-10 999
5 2022-01-03 999
6 2022-01-10 999

5443 rows × 2 columns

# Earliest activity date per user becomes that user's cohort start date
first_date = cohort_df.groupby(by=["user_id"]).min().rename(columns={"date": "start_date"})
# Left join attaches start_date to every activity row for that user
cohort_df = cohort_df.join(first_date, on="user_id", how="left")
cohort_df
date user_id start_date
0 2022-01-06 0 2022-01-01
1 2022-01-10 0 2022-01-01
2 2022-01-01 0 2022-01-01
3 2022-01-07 0 2022-01-01
4 2022-01-06 0 2022-01-01
... ... ... ...
2 2022-01-10 999 2022-01-03
3 2022-01-06 999 2022-01-03
4 2022-01-10 999 2022-01-03
5 2022-01-03 999 2022-01-03
6 2022-01-10 999 2022-01-03

5443 rows × 3 columns

# Rows: cohort start date; columns: activity date; values: count of activity events
cohort_crosstab = pd.crosstab(cohort_df['start_date'], cohort_df['date'])
# Diagonal = each cohort's event count on its own start date (the retention denominator)
cohort_totals = np.diag(cohort_crosstab).reshape(-1, 1)

# Convert counts to percentages of the cohort's start-date total
cohort_crosstab[cohort_crosstab.columns] = 100 * cohort_crosstab.values / cohort_totals
# NOTE(review): this counts events, not unique users, so a cell can exceed 100%
# (see the 167% value in the output) — consider de-duplicating (user_id, date) first
cohort_crosstab = cohort_crosstab.applymap('{:,.0f}%'.format)
cohort_crosstab
date 2022-01-01 2022-01-02 2022-01-03 2022-01-04 2022-01-05 2022-01-06 2022-01-07 2022-01-08 2022-01-09 2022-01-10
start_date
2022-01-01 100% 39% 45% 43% 42% 45% 47% 47% 34% 42%
2022-01-02 0% 100% 44% 42% 45% 41% 43% 45% 38% 42%
2022-01-03 0% 0% 100% 46% 32% 39% 31% 42% 37% 40%
2022-01-04 0% 0% 0% 100% 38% 43% 33% 39% 35% 43%
2022-01-05 0% 0% 0% 0% 100% 42% 38% 64% 38% 39%
2022-01-06 0% 0% 0% 0% 0% 100% 24% 37% 44% 44%
2022-01-07 0% 0% 0% 0% 0% 0% 100% 47% 63% 42%
2022-01-08 0% 0% 0% 0% 0% 0% 0% 100% 25% 31%
2022-01-09 0% 0% 0% 0% 0% 0% 0% 0% 100% 167%
2022-01-10 0% 0% 0% 0% 0% 0% 0% 0% 0% 100%

Funnel#

# Two offices' funnel stage counts, stacked long-form for px.funnel
stages = ["Website visit", "Downloads", "Potential customers", "Requested price", "Invoice sent"]
df_mtl = pd.DataFrame(dict(number=[39, 27.4, 20.6, 11, 3], stage=stages))
df_mtl['office'] = 'Montreal'
df_toronto = pd.DataFrame(dict(number=[52, 36, 18, 14, 5], stage=stages))
df_toronto['office'] = 'Toronto'
df = pd.concat([df_mtl, df_toronto], axis=0)

# Side-by-side funnels, one color per office
fig = px.funnel(df, x='number', y='stage', color='office')
display(fig)
# Same funnel built with graph_objects; textinfo shows each stage's value
# and its percentage of the first stage
trace0 = go.Funnel(
        y = stages,
        x = [49, 29, 26, 11, 2], 
        textinfo = "value+percent initial")

# Fill out data with our traces
traces = [trace0]
# Plot it and save as basic-line.html
pyo.iplot(traces, filename = 'funnel_2')